Problem Set 1 by Eric Huang and Cameron Wang

Q1, Drop the fur_color using pd.drop

InĀ [3]:
# Import Libraries
import numpy as np # numpy
import pandas as pd # pandas for datasets
from skimage.util import img_as_int
InĀ [2]:
import math
InĀ [4]:
data = {
    "height": [62, 55, 54, 50],
    "weight": [55, 44, 50, 48],
    "tail_length": [12, 2, 3, 5],
    "tail_width": [2, 3, 2, 4],
    "fur_color": ["brown", "yellow", "brown", "yellow"],
    "species": ["dog", "cat", "dog", "cat"]
}

Animal_Data = pd.DataFrame(data)
Animal_Data=Animal_Data.drop(columns=["fur_color"])
InĀ [5]:
target= Animal_Data["species"]
data = Animal_Data.loc[:,"height":"tail_width"]
InĀ [6]:
target
Out[6]:
0    dog
1    cat
2    dog
3    cat
Name: species, dtype: object
InĀ [7]:
pd.plotting.scatter_matrix(pd.DataFrame(data=data), alpha=0.2)
Out[7]:
array([[<Axes: xlabel='height', ylabel='height'>,
        <Axes: xlabel='weight', ylabel='height'>,
        <Axes: xlabel='tail_length', ylabel='height'>,
        <Axes: xlabel='tail_width', ylabel='height'>],
       [<Axes: xlabel='height', ylabel='weight'>,
        <Axes: xlabel='weight', ylabel='weight'>,
        <Axes: xlabel='tail_length', ylabel='weight'>,
        <Axes: xlabel='tail_width', ylabel='weight'>],
       [<Axes: xlabel='height', ylabel='tail_length'>,
        <Axes: xlabel='weight', ylabel='tail_length'>,
        <Axes: xlabel='tail_length', ylabel='tail_length'>,
        <Axes: xlabel='tail_width', ylabel='tail_length'>],
       [<Axes: xlabel='height', ylabel='tail_width'>,
        <Axes: xlabel='weight', ylabel='tail_width'>,
        <Axes: xlabel='tail_length', ylabel='tail_width'>,
        <Axes: xlabel='tail_width', ylabel='tail_width'>]], dtype=object)
No description has been provided for this image
InĀ [8]:
# Shuffle the DataFrame rows
data = data.sample(frac=1).reset_index(drop=True)

# We will use all 4 examples as trainingset
InĀ [9]:
data
Out[9]:
height weight tail_length tail_width
0 54 50 3 2
1 55 44 2 3
2 50 48 5 4
3 62 55 12 2
InĀ [10]:
# As required
k = 3
testX = pd.DataFrame(
    data = {
    "height": [55],
    "weight": [57],
    "tail_length": [8],
    "tail_width": [5],
}
)
distances = pd.DataFrame(
    data = {
        "Distance":[0,0,0,0]
    }
)
for i in range(0,3):
    row = data.iloc[i]
    distance = 0
    for j in range(0,3):
        distance = np.sqrt(np.sum(np.square(row[j] - testX.iloc[0][j])))
    distances["Distance"][i] = distance
/var/folders/mj/y3mghvl90p91nkgx8lglzrqw0000gn/T/ipykernel_4384/1548940541.py:20: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  distance = np.sqrt(np.sum(np.square(row[j] - testX.iloc[0][j])))
/var/folders/mj/y3mghvl90p91nkgx8lglzrqw0000gn/T/ipykernel_4384/1548940541.py:21: FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  distances["Distance"][i] = distance
InĀ [11]:
distances
Out[11]:
Distance
0 5
1 6
2 3
3 0
InĀ [12]:
print(distances)
   Distance
0         5
1         6
2         3
3         0
InĀ [13]:
indices = np.argsort(distances["Distance"][1:])
InĀ [14]:
neighborYs = target[indices]
InĀ [15]:
print(neighborYs)
2    dog
1    cat
0    dog
Name: species, dtype: object

Final Guess is 2 because there are 2 dogs and 1 cats in the 3 nearest neighbors

Weight is the most important feature because it contributes to distances the most ( Biggest Gap between training and testing data)

One Hot Encoding for Q2

InĀ [16]:
# Use number instead of pandas

data = [
    [62, 55, 12, 2, 'brown', 'dog'],
    [55, 44, 2, 3, 'yellow', 'cat'],
    [54, 50, 3, 2, 'brown', 'dog'],
    [50, 48, 5, 4, 'yellow', 'cat']
]

x' = \frac((x - x_min),( x_max-x_min))

InĀ [17]:
fur_color_map = {'brown': [1, 0], 'yellow': [0, 1]}
species_map = {'dog': [1, 0], 'cat': [0, 1]}

X = []
y = []
for row in data:
    numerical_features = row[:4]
    fur_color_encoded = fur_color_map[row[4]]
    species_encoded = species_map[row[5]]
    X.append(numerical_features + fur_color_encoded)
    y.append(species_encoded)
trainX = np.array(X)
trainY = np.array(y)
InĀ [18]:
numerical_features = trainX[:, :4]
min_vals = numerical_features.min(axis=0)
max_vals = numerical_features.max(axis=0)
scaled_numerical_features = (numerical_features - min_vals) / (max_vals - min_vals)

# Combine scaled numerical features with encoded categorical data
trainX_normalized = np.hstack([scaled_numerical_features, trainX[:, 4:]])

Here is my normalized dataset:

print(trainX_normalized)
InĀ [19]:
print(trainX_normalized)
[[1.         1.         1.         0.         1.         0.        ]
 [0.41666667 0.         0.         0.5        0.         1.        ]
 [0.33333333 0.54545455 0.1        0.         1.         0.        ]
 [0.         0.36363636 0.3        1.         0.         1.        ]]

Q3:

InĀ [20]:
testX = np.array([55,57,8,5,0,1])
InĀ [21]:
k=3
# 1. calculate distances between test examples and training examples
distances = np.zeros((len(trainX), len(testX)))
for i in range(len(trainX)):
    for j in range(len(testX)):
        distances[i,j] = np.sqrt(np.sum(np.square(trainX[i] - testX[j])))

# 2. for each data point, select k closest training instances
InĀ [22]:
print(distances)
[[103.18914672 107.07007051  72.73238618  76.47221718  83.77350416
   82.21921916]
 [107.58717396 111.79892665  60.65476074  63.83572667  70.5336799
   69.07242576]
 [107.14476189 111.28342195  63.67102952  66.93280212  73.68853371
   72.22188034]
 [105.43244282 109.67223897  59.1776985   62.57795139  69.61321714
   68.08817812]]
InĀ [23]:
indices = np.argsort(distances[:,1])[0:k]
neighborYs = trainY[indices]
InĀ [24]:
distances
Out[24]:
array([[103.18914672, 107.07007051,  72.73238618,  76.47221718,
         83.77350416,  82.21921916],
       [107.58717396, 111.79892665,  60.65476074,  63.83572667,
         70.5336799 ,  69.07242576],
       [107.14476189, 111.28342195,  63.67102952,  66.93280212,
         73.68853371,  72.22188034],
       [105.43244282, 109.67223897,  59.1776985 ,  62.57795139,
         69.61321714,  68.08817812]])
InĀ [25]:
print(neighborYs)
[[1 0]
 [0 1]
 [1 0]]

As seen here in the neighborYs's Array, we have 2 dogs ( index 0 and 2) and 2 cat ( index 1) So the prediction does not change, answer here is also dog.

Q4 & 5

InĀ [27]:
from IPython.display import Image
Image("Q4_1.jpeg")
Out[27]:
No description has been provided for this image
  1. matrix = [ [3,3,3], [0,6,6] [9,0,9]

]

  1. matrix = [ [35]

]

InĀ [28]:
Image("Q4_2.jpeg")
Out[28]:
No description has been provided for this image
  1. matrix = [ [3,3,5], [0,0,0], [3,3,5], ]

matrix = [ [10,1,10], [1,5,5], [10,5,14] ]

InĀ [29]:
Image("Question 5.jpg")
Out[29]:
No description has been provided for this image

w1 = 1, w2 = 2, w3 = 1

InĀ [5]:
# Run this to generate pdf
!jupyter nbconvert --to html PS1.ipynb
[NbConvertApp] Converting notebook PS1.ipynb to html
[NbConvertApp] WARNING | Alternative text is missing on 4 image(s).
[NbConvertApp] Writing 9495730 bytes to PS1.html